library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ---------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gt)
## Warning: package 'gt' was built under R version 3.6.3
data("flights", package = "pnwflights14")
library(DT)
## Warning: package 'DT' was built under R version 3.6.3
flight_PDX<-flights%>%
filter(origin=="PDX")
datatable(flight_PDX,rownames = FALSE)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
library(tidyverse)
track_record_PDX<-flight_PDX%>%
group_by(month,carrier)%>%
summarize(Avg_dep_delay=mean(dep_delay,na.rm = TRUE))%>%
arrange(month,Avg_dep_delay)
datatable(track_record_PDX,rownames = FALSE)
Which airlines had the best and worst track records of on-time departures in each month? Is it different between PDX and SEA?
Average_track_PDX<-track_record_PDX%>%
group_by(month) %>%
slice(which.min(Avg_dep_delay),which.max(Avg_dep_delay))
Average_track_PDX%>%gt(groupname_col = "month") %>%
tab_header(title="PDX-Best and worst carrier in depature delay each month")%>%
cols_label(month="Month",carrier="Carrier",Avg_dep_delay="Average dep delay")%>%
cols_align(align="center") %>%
tab_style(
cell_text(style = "italic"),
locations = cells_title(groups=c("title"))
)
| Carrier |
Average dep delay |
| 1 |
| HA |
-2.451613 |
| F9 |
21.100000 |
| 2 |
| HA |
-2.178571 |
| VX |
28.631579 |
| 3 |
| HA |
-2.967742 |
| AA |
24.852459 |
| 4 |
| HA |
-6.133333 |
| AA |
16.738889 |
| 5 |
| AS |
-2.354346 |
| WN |
11.215622 |
| 6 |
| HA |
-3.833333 |
| F9 |
18.113636 |
| 7 |
| HA |
-2.483871 |
| WN |
13.844753 |
| 8 |
| HA |
-4.322581 |
| AA |
9.504673 |
| 9 |
| VX |
-2.333333 |
| HA |
15.366667 |
| 10 |
| VX |
-5.129032 |
| AA |
13.701754 |
| 11 |
| HA |
1.200000 |
| WN |
8.688581 |
| 12 |
| HA |
-1.354839 |
| AA |
19.365854 |
The above table shows the best and worst airline departure delay every month.It seems in month 1st,best is HA whereas worst is F9, similarly in 2nd month, best is HA and worst is VX
flight_SEA<-flights%>%
filter(origin=="SEA")
datatable(flight_SEA)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
track_record_SEA<-flight_SEA%>%
group_by(month,carrier)%>%
summarize(Avg_dep_delay=mean(dep_delay,na.rm = TRUE))%>%
arrange(month,Avg_dep_delay)
datatable(track_record_SEA,rownames = FALSE)
Average_track_SEA<-track_record_SEA%>%
group_by(month) %>%
slice(which.min(Avg_dep_delay),which.max(Avg_dep_delay))
Average_track_SEA%>%gt() %>%
tab_header(title="SEA-Best and worst carrier in depature delay each month")%>%
cols_label(month="Month",carrier="Carrier",Avg_dep_delay="Average dep delay")%>%
cols_align(align="center") %>%
tab_style(
cell_text(style = "italic"),
locations = cells_title(groups=c("title"))
)
| Carrier |
Average dep delay |
| 1 |
| AS |
1.85991262 |
| F9 |
22.57843137 |
| 2 |
| HA |
0.33928571 |
| F9 |
24.66666667 |
| 3 |
| HA |
2.19354839 |
| WN |
11.42794279 |
| 4 |
| US |
0.72830189 |
| F9 |
12.56310680 |
| 5 |
| HA |
-0.12903226 |
| WN |
12.46680080 |
| 6 |
| HA |
-3.15000000 |
| WN |
21.21019608 |
| 7 |
| HA |
0.80645161 |
| F9 |
21.71014493 |
| 8 |
| OO |
3.40660737 |
| HA |
19.63934426 |
| 9 |
| US |
0.03010033 |
| WN |
9.41641939 |
| 10 |
| HA |
-2.12903226 |
| AA |
18.92592593 |
| 11 |
| US |
2.33333333 |
| UA |
11.55710660 |
| 12 |
| US |
2.43824701 |
| VX |
31.45374449 |
The best and worst track records of on-time departures of airlines in each month is different for PDX and SEA. For each city Portland and Seattle, it seems every month the position of best and worst airlines are getting changed, showing no consistency.
What cities have the most service from Portland (defined however you like, but do make sure to define it clearly!), and which have the worst?
most_service<-flight_PDX%>%
group_by(dest)%>%
summarise(flight=n())%>%
arrange(desc(flight))
most_service%>%gt() %>%
tab_header(title="PDX-Number of flights in each destination")%>%
cols_label(dest="Destination",flight="Number of flights")%>%
cols_align(align="center") %>%
tab_style(
cell_text(align = "right"),
locations = cells_title(groups = c("subtitle"))
) %>%
tab_style(
cell_text(style = "italic"),
locations = cells_title(groups=c("title"))
)
| Destination |
Number of flights |
| SFO |
5179 |
| DEN |
3940 |
| PHX |
3570 |
| LAX |
3001 |
| ORD |
2501 |
| LAS |
2482 |
| SLC |
2465 |
| SJC |
2368 |
| SEA |
2289 |
| OAK |
1944 |
| DFW |
1884 |
| SMF |
1818 |
| ATL |
1558 |
| SAN |
1550 |
| MSP |
1362 |
| BUR |
1027 |
| SNA |
1023 |
| LGB |
1008 |
| ANC |
1000 |
| IAH |
998 |
| ONT |
925 |
| JFK |
854 |
| RDM |
731 |
| HNL |
730 |
| MDW |
676 |
| EUG |
636 |
| OGG |
605 |
| EWR |
510 |
| BOS |
504 |
| IAD |
371 |
| DCA |
365 |
| MCI |
365 |
| SBA |
365 |
| TUS |
365 |
| DTW |
310 |
| ABQ |
299 |
| CLT |
277 |
| PSP |
271 |
| PHL |
180 |
| KOA |
168 |
| RNO |
164 |
| LMT |
154 |
| LIH |
103 |
| BOI |
98 |
| FAI |
93 |
| BWI |
86 |
| AUS |
63 |
| HOU |
63 |
| STL |
37 |
max(most_service$flight)
## [1] 5179
From the above table, it shows SFO or San Fransisco has highest number of services from portland.STL has the lowest or worst number of services from PORTLAND.
Which airlines improved the most in terms of on-time departures over time, and on which routes? Which airlines got worse?
xtabs(Avg_dep_delay~carrier+month,track_record_PDX)
## month
## carrier 1 2 3 4 5 6
## AA 8.37158470 14.98125000 24.85245902 16.73888889 10.91959799 17.12500000
## AS -1.18441815 2.48504983 -0.33514493 -1.72155412 -2.35434575 0.30747664
## B6 9.10843373 9.25352113 -1.37974684 5.91764706 4.89565217 1.40157480
## DL 4.05121294 9.17571885 2.09026128 -0.51288056 1.04444444 2.96007984
## F9 21.10000000 13.79166667 6.11403509 2.67889908 7.64444444 18.11363636
## HA -2.45161290 -2.17857143 -2.96774194 -6.13333333 -1.64516129 -3.83333333
## OO 5.80552712 7.90307868 3.21081577 1.26673327 2.33487085 3.99004975
## UA 9.12328767 15.69393140 5.80470588 1.46421268 7.90820312 8.21190893
## US 4.36416185 1.06711409 -1.70370370 1.74626866 -1.21363636 2.20689655
## VX -1.81609195 28.63157895 4.70588235 4.08139535 4.30337079 7.68852459
## WN 15.82699387 17.22971114 10.31091510 11.33922652 11.21562156 16.84363296
## month
## carrier 7 8 9 10 11 12
## AA 6.90521327 9.50467290 8.90640394 13.70175439 8.61600000 19.36585366
## AS 1.49910072 2.36147757 0.92829457 0.02506964 2.16844920 7.36274510
## B6 11.67955801 6.30000000 7.38947368 2.83516484 8.20000000 3.21590909
## DL 3.66785080 2.73713235 2.21250000 0.24129353 2.80169972 2.25490196
## F9 9.42028986 1.98540146 4.76422764 -0.81415929 5.16666667 14.18181818
## HA -2.48387097 -4.32258065 15.36666667 1.35483871 1.20000000 -1.35483871
## OO 4.18908629 3.66990291 2.09876543 2.62011173 7.50389105 10.82581967
## UA 6.45563140 6.94627383 5.44503546 7.40105079 5.42600897 10.79600887
## US 4.32644628 0.46521739 0.26288660 -0.55801105 3.10691824 4.55688623
## VX 7.29032258 6.45161290 -2.33333333 -5.12903226 4.44827586 7.13333333
## WN 13.84475282 9.33173996 6.93010753 6.78040904 8.68858131 18.12652608
first_six_month<-flight_PDX%>%
filter(month<7)
first_six_month_tidy<-first_six_month%>%
group_by(carrier)%>%
summarize(jan_to_june=median(dep_delay,na.rm = TRUE))
first_six_month_tidy%>%gt() %>%
tab_header(title="PDX-dep_delay average from Jan to June")%>%
cols_label(carrier="Carrier",jan_to_june="Jan to June")%>%
cols_align(align="center") %>%
tab_style(
cell_text(align = "right"),
locations = cells_title(groups = c("subtitle"))
) %>%
tab_style(
cell_text(style = "italic"),
locations = cells_title(groups=c("title"))
)
| Carrier |
Jan to June |
| AA |
-2 |
| AS |
-5 |
| B6 |
-4 |
| DL |
-3 |
| F9 |
-1 |
| HA |
-6 |
| OO |
-4 |
| UA |
-2 |
| US |
-4 |
| VX |
-4 |
| WN |
2 |
last_six_month<-flight_PDX%>%
filter(month>6)
last_six_month_tidy<-last_six_month%>%
group_by(carrier)%>%
summarize(july_to_dec=median(dep_delay,na.rm = TRUE))
last_six_month_tidy%>%gt() %>%
tab_header(title="PDX-dep_delay average from July to Dec")%>%
cols_label(carrier="Carrier",july_to_dec="July to Dec")%>%
cols_align(align="center") %>%
tab_style(
cell_text(align = "right"),
locations = cells_title(groups = c("subtitle"))
) %>%
tab_style(
cell_text(style = "italic"),
locations = cells_title(groups=c("title"))
)
| Carrier |
July to Dec |
| AA |
-2 |
| AS |
-4 |
| B6 |
-3 |
| DL |
-3 |
| F9 |
-4 |
| HA |
-4 |
| OO |
-4 |
| UA |
-1 |
| US |
-3 |
| VX |
-3 |
| WN |
1 |
improvement<-merge(first_six_month_tidy,last_six_month_tidy,by="carrier")
improvement_final<-improvement%>%
mutate(improvement_factor=july_to_dec-jan_to_june)%>%
arrange(improvement_factor)
improvement_final%>%gt() %>%
tab_header(title="PDX-Airlines improvement in dep_delay average ")%>%
cols_label(carrier="Carrier",jan_to_june="Jan to June",july_to_dec="July to Dec",improvement_factor="Improvement factor")%>%
cols_align(align="center") %>%
tab_style(
cell_text(align = "right"),
locations = cells_title(groups = c("subtitle"))
) %>%
tab_style(
cell_text(style = "italic"),
locations = cells_title(groups=c("title"))
)
| Carrier |
Jan to June |
July to Dec |
Improvement factor |
| F9 |
-1 |
-4 |
-3 |
| WN |
2 |
1 |
-1 |
| AA |
-2 |
-2 |
0 |
| DL |
-3 |
-3 |
0 |
| OO |
-4 |
-4 |
0 |
| AS |
-5 |
-4 |
1 |
| B6 |
-4 |
-3 |
1 |
| UA |
-2 |
-1 |
1 |
| US |
-4 |
-3 |
1 |
| VX |
-4 |
-3 |
1 |
| HA |
-6 |
-4 |
2 |
From the above table it shows that F9 improved a lot whereas HA has worst delay rate.
Using kable() function:
library(knitr)
## Warning: package 'knitr' was built under R version 3.6.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.6.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
first_Six<-flight_PDX%>%
filter((carrier=="F9" | carrier=="HA")& month<7 )%>%
group_by(dest,carrier)%>%
summarize(first_avg=median(dep_delay,na.rm = TRUE))
column1=c("Destination","Carrier","First 6 month average")
first_Six%>%
kable(format = "html", digits = 2, caption = "First six month average departure delay",col.names = column1) %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "center")
First six month average departure delay
|
Destination
|
Carrier
|
First 6 month average
|
|
DEN
|
F9
|
-1
|
|
HNL
|
HA
|
-6
|
|
STL
|
F9
|
-1
|
last_Six<-flight_PDX%>%
filter((carrier=="F9" | carrier=="HA")& month>6)%>%
group_by(dest,carrier)%>%
summarize(second_avg=median(dep_delay,na.rm = TRUE))
column2=c("Destination","Carrier","Last 6 month average")
last_Six%>%
kable(format = "html", caption = "Last six month average departure delay",col.names = column2) %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "center")
Last six month average departure delay
|
Destination
|
Carrier
|
Last 6 month average
|
|
DEN
|
F9
|
-4
|
|
HNL
|
HA
|
-4
|
|
STL
|
F9
|
-9
|
Finding out best and worst route for F9 and HA
final<-full_join(first_Six, last_Six, by = c("dest", "carrier"))%>%
mutate(improvement_factor=second_avg-first_avg)
column=c("Destination","Carrier","First six month","Last 6 month","Improvement")
final%>% kable(format = "html", digits = 2, caption = "Improvement in average departure delay",col.names = column) %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "center")
Improvement in average departure delay
|
Destination
|
Carrier
|
First six month
|
Last 6 month
|
Improvement
|
|
DEN
|
F9
|
-1
|
-4
|
-3
|
|
HNL
|
HA
|
-6
|
-4
|
2
|
|
STL
|
F9
|
-1
|
-9
|
-8
|
The above data shows, carrier F9 improved alot over time for STL route and HA has worst improvement over months in HNL route.
In most of the table I used gt() and datatable() function. I liked the gt() function most. It has lots of parameters for customization as per the need. Also, I have seen how it grouped and represent the group_by data without mentioning it explicitly.I am not sure, but I think in datatable one can not add titles or renaming column headers. gt() function have all these additional customizations.I also used “kable” function and its also nice to represent table in paper publish format.I also, tried with kable() function. This one is also a good choice with lots of parameters to customize tables.I kept the alighment of the table as “centre”, which looks great to me.Also, in kable() function, I realise, the table header is in slightly lighter in shade. I am not very sure if it can be make dark.
FONTS
library(ggplot2)
library(extrafont)
## Warning: package 'extrafont' was built under R version 3.6.2
loadfonts(device = "win")
plot<-ggplot(improvement_final,aes(x=carrier,y=improvement_factor))+
geom_col(aes(fill=improvement_factor))+
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text = element_text(size = 15)) +
labs(x = "Carrier", y = "Improvement in departure delay")+
theme(panel.background = element_blank(),axis.line = element_line(colour = "black"))+
theme(panel.border = element_rect(linetype = "dashed", fill = NA))+
ggtitle(~""*underline("PDX-Airlines improvement in dep_delay average"))
Serif
library(extrafont)
loadfonts(device = "win")
library(showtext)
## Warning: package 'showtext' was built under R version 3.6.3
## Warning: package 'sysfonts' was built under R version 3.6.3
## Warning: package 'showtextdb' was built under R version 3.6.3
showtext_auto()
font_add_google("Tinos")
plot+theme(text=element_text(size=12, family="Tinos",color = "brown"))

As per my findings above, the above plot also shows, F9 improved a lot over time whereas HA has worst average departure delay time.
San - Serif
font_add_google("Open Sans")
plot+theme(text=element_text(size=12, family="Open Sans",color = "brown"))

Google Display font - lobster
showtext_opts(dpi = 72)
font_add_google("Lobster", "lobster")
plot+theme(text=element_text(size=22, family="lobster",color = "brown"))

I have chosen 3 fonts for my plot texts. Serif , San serif and Display font.
Conclusion:
Tinos font within Serif categories caught my attention because it is very soothing and refreshing. Tinos font is quite similar to Times New Roman font and it looks more professional to me. This font is appropriate for axis titles in plot. I selected Open Sans within Sans Serif family. This font is very easy to read and has friendly appearance. Even though the Open Sans font was optimized for print, web, and mobile interfaces, I like it for professional graphs and presentations. Among display fronts, I selected Lobster font. I think Lobster font is good for certain display for informal presentation, but it is not good for plots. Choosing the right font depends on the purpose and the audience.